{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Monte Carlo Method\n",
    "\n",
    "As a simple example of a Monte Carlo method, we will approximate the value of $\\pi$:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pyopencl as cl\n",
    "import pyopencl.array\n",
    "import pyopencl.clrandom\n",
    "import loopy as lp\n",
    "\n",
    "from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Choose platform:\n",
      "[0] <pyopencl.Platform 'Portable Computing Language' at 0x7fa2c133d6e8>\n",
      "[1] <pyopencl.Platform 'Intel(R) OpenCL' at 0x2be2948>\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      "Choice [0]: \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Set the environment variable PYOPENCL_CTX='' to avoid being asked again.\n"
     ]
    }
   ],
   "source": [
    "ctx = cl.create_some_context(interactive=True)\n",
    "queue = cl.CommandQueue(ctx)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sample generation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "knl = lp.make_kernel(\n",
    "        \"{ [i, j]: 0<=i<n and 0<=j < 2}\",\n",
    "        \"\"\"\n",
    "        <> ctr = make_uint4(0, 1, 2, 3)\n",
    "        for i\n",
    "            <> key2 = make_uint2(i, k1)\n",
    "            <> rng_res, <> dummy = philox4x32_f32(ctr, key2)\n",
    "\n",
    "            samples[i,0] = rng_res.s0 + 1j*rng_res.s1 {id=samp0}\n",
    "            samples[i,1] = rng_res.s2 + 1j*rng_res.s3 {id=samp1}\n",
    "\n",
    "            accepted[i,j] = real(samples[i,j] * conj(samples[i,j])) < 1 {dep=samp*,nosync=samp*}\n",
    "        end\n",
    "        \"\"\")                                                                     \n",
    "\n",
    "knl = lp.split_iname(knl, \"i\", 128, outer_tag=\"g.0\", inner_tag=\"l.0\")\n",
    "knl = lp.set_options(knl, return_dict=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "---------------------------------------------------------------------------\n",
      "KERNEL: loopy_kernel\n",
      "---------------------------------------------------------------------------\n",
      "ARGUMENTS:\n",
      "accepted: type: <auto/runtime>, shape: (n, 2), dim_tags: (N1:stride:2, N0:stride:1) aspace: global\n",
      "k1: ValueArg, type: <auto/runtime>\n",
      "n: ValueArg, type: <auto/runtime>\n",
      "samples: type: <auto/runtime>, shape: (n, 2), dim_tags: (N1:stride:2, N0:stride:1) aspace: global\n",
      "---------------------------------------------------------------------------\n",
      "DOMAINS:\n",
      "[n] -> { [j, i_outer, i_inner] : 0 <= j <= 1 and i_inner >= 0 and -128i_outer <= i_inner <= 127 and i_inner < n - 128i_outer }\n",
      "---------------------------------------------------------------------------\n",
      "INAME IMPLEMENTATION TAGS:\n",
      "i_inner: l.0\n",
      "i_outer: g.0\n",
      "j: None\n",
      "---------------------------------------------------------------------------\n",
      "TEMPORARIES:\n",
      "ctr: type: <auto/runtime>, shape: () scope:auto\n",
      "dummy: type: <auto/runtime>, shape: () scope:auto\n",
      "key2: type: <auto/runtime>, shape: () scope:auto\n",
      "rng_res: type: <auto/runtime>, shape: () scope:auto\n",
      "---------------------------------------------------------------------------\n",
      "INSTRUCTIONS:\n",
      "↱   \u001b[36mctr\u001b[0m = \u001b[35mmake_uint4(0, 1, 2, 3)\u001b[0m  {id=\u001b[32minsn\u001b[0m}\n",
      "│   for i_inner, i_outer\n",
      "│↱      \u001b[36mkey2\u001b[0m = \u001b[35mmake_uint2(i_inner + i_outer*128, k1)\u001b[0m  {id=\u001b[32minsn_0\u001b[0m}\n",
      "└└↱     \u001b[36mrng_res, dummy\u001b[0m = \u001b[35mphilox4x32_f32(ctr, key2)\u001b[0m  {id=\u001b[32minsn_1\u001b[0m}\n",
      "↱ ├     \u001b[36msamples[i_inner + i_outer*128, 0]\u001b[0m = \u001b[35mrng_res.s0 + 1j*rng_res.s1\u001b[0m  {id=\u001b[32msamp0\u001b[0m}\n",
      "│↱└     \u001b[36msamples[i_inner + i_outer*128, 1]\u001b[0m = \u001b[35mrng_res.s2 + 1j*rng_res.s3\u001b[0m  {id=\u001b[32msamp1\u001b[0m}\n",
      "││      for j\n",
      "└└        \u001b[36maccepted[i_inner + i_outer*128, j]\u001b[0m = \u001b[35mreal(samples[i_inner + i_outer*128, j]*conj(samples[i_inner + i_outer*128, j])) < 1\u001b[0m  {id=\u001b[32minsn_2\u001b[0m, no_sync_with=samp0@any:samp1@any}\n",
      "    end i_inner, i_outer, j\n",
      "---------------------------------------------------------------------------\n"
     ]
    }
   ],
   "source": [
    "print(knl)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "knl = lp.set_options(knl, write_cl=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mif __OPENCL_C_VERSION__ < 120\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mpragma OPENCL EXTENSION cl_khr_fp64: enable\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mendif\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine PYOPENCL_DEFINE_CDOUBLE\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36minclude\u001b[39;49;00m \u001b[37m<pyopencl-complex.h>\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36minclude\u001b[39;49;00m \u001b[37m<pyopencl-random123/philox.cl>\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\n",
      "\n",
      "\n",
      "\u001b[34mtypedef\u001b[39;49;00m \u001b[34munion\u001b[39;49;00m {\n",
      "    uint4 v;\n",
      "    philox4x32_ctr_t c;\n",
      "} philox4x32_ctr_vec_union;\n",
      "\n",
      "\n",
      "uint4 \u001b[32mphilox4x32_bump\u001b[39;49;00m(uint4 ctr)\n",
      "{\n",
      "    \u001b[34mif\u001b[39;49;00m (++ctr.x == \u001b[34m0\u001b[39;49;00m)\n",
      "        \u001b[34mif\u001b[39;49;00m (++ctr.y == \u001b[34m0\u001b[39;49;00m)\n",
      "            ++ctr.z;\n",
      "    \u001b[34mreturn\u001b[39;49;00m ctr;\n",
      "}\n",
      "\n",
      "uint4 \u001b[32mphilox4x32_gen\u001b[39;49;00m(\n",
      "        uint4 ctr,\n",
      "        uint2 key,\n",
      "        uint4 *new_ctr)\n",
      "{\n",
      "    philox4x32_ctr_vec_union result;\n",
      "    result.c = philox4x32(\n",
      "        *(philox4x32_ctr_t *) &ctr,\n",
      "        *(philox4x32_key_t *) &key);\n",
      "    *new_ctr = philox4x32_bump(ctr);\n",
      "    \u001b[34mreturn\u001b[39;49;00m result.v;\n",
      "}\n",
      "\n",
      "float4 \u001b[32mphilox4x32_f32\u001b[39;49;00m(\n",
      "        uint4 ctr,\n",
      "        uint2 key,\n",
      "        uint4 *new_ctr)\n",
      "{\n",
      "    *new_ctr = ctr;\n",
      "    \u001b[34mreturn\u001b[39;49;00m\n",
      "        convert_float4(philox4x32_gen(*new_ctr, key, new_ctr))\n",
      "        * \u001b[34m2.3283064365386963e-10\u001b[39;49;00mf;\n",
      "}\n",
      "\n",
      "double4 \u001b[32mphilox4x32_f64\u001b[39;49;00m(\n",
      "        uint4 ctr,\n",
      "        uint2 key,\n",
      "        uint4 *new_ctr)\n",
      "{\n",
      "    *new_ctr = ctr;\n",
      "        \u001b[34mreturn\u001b[39;49;00m\n",
      "            convert_double4(philox4x32_gen(*new_ctr, key, new_ctr))\n",
      "            * \u001b[34m2.3283064365386963e-10\u001b[39;49;00m\n",
      "            +\n",
      "            convert_double4(philox4x32_gen(*new_ctr, key, new_ctr))\n",
      "            * \u001b[34m5.421010862427522e-20\u001b[39;49;00m;\n",
      "\n",
      "}\n",
      "\n",
      "__kernel \u001b[36mvoid\u001b[39;49;00m \u001b[32m__attribute__\u001b[39;49;00m ((reqd_work_group_size(\u001b[34m128\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(__global \u001b[36mint\u001b[39;49;00m *__restrict__ accepted, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m k1, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m n, __global cdouble_t *__restrict__ samples)\n",
      "{\n",
      "  uint4 ctr;\n",
      "  uint4 dummy;\n",
      "  uint4 insn_1_retval_1;\n",
      "  uint2 key2;\n",
      "  float4 rng_res;\n",
      "\n",
      "  \u001b[34mif\u001b[39;49;00m (-\u001b[34m1\u001b[39;49;00m + -\u001b[34m128\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m)\n",
      "  {\n",
      "    key2 = (uint2) (lid(\u001b[34m0\u001b[39;49;00m) + gid(\u001b[34m0\u001b[39;49;00m) * \u001b[34m128\u001b[39;49;00m, k1);\n",
      "    ctr = (uint4) (\u001b[34m0\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m, \u001b[34m2\u001b[39;49;00m, \u001b[34m3\u001b[39;49;00m);\n",
      "    rng_res = philox4x32_f32(ctr, key2, &(insn_1_retval_1));\n",
      "    dummy = insn_1_retval_1;\n",
      "    samples[\u001b[34m2\u001b[39;49;00m * (\u001b[34m128\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)) + \u001b[34m1\u001b[39;49;00m] = cdouble_radd(rng_res.s2, cdouble_rmul(rng_res.s3, cdouble_new(\u001b[34m0.0\u001b[39;49;00m, \u001b[34m1.0\u001b[39;49;00m)));\n",
      "    samples[\u001b[34m2\u001b[39;49;00m * (\u001b[34m128\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m))] = cdouble_radd(rng_res.s0, cdouble_rmul(rng_res.s1, cdouble_new(\u001b[34m0.0\u001b[39;49;00m, \u001b[34m1.0\u001b[39;49;00m)));\n",
      "    \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m j = \u001b[34m0\u001b[39;49;00m; j <= \u001b[34m1\u001b[39;49;00m; ++j)\n",
      "      accepted[\u001b[34m2\u001b[39;49;00m * (\u001b[34m128\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)) + j] = cdouble_real(cdouble_rmul(\u001b[34m1\u001b[39;49;00m, cdouble_mul(samples[\u001b[34m2\u001b[39;49;00m * (\u001b[34m128\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)) + j], cdouble_conj(samples[\u001b[34m2\u001b[39;49;00m * (\u001b[34m128\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)) + j])))) < \u001b[34m1.0\u001b[39;49;00m;\n",
      "  }\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "evt, result = knl(queue, n=100000, k1=np.int32(99123))\n",
    "\n",
    "samples = result[\"samples\"].reshape(-1)\n",
    "accepted = result[\"accepted\"].reshape(-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x7fa281a13640>]"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAP4AAAD4CAYAAADMz1tMAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAQ8ElEQVR4nO3de4xcZ33G8e+TGHNrSAAvFfKFNWFTWBKE0SgXRSqpgNZJWzsSLdhK1KaKbEEbVCsoklPTAAb+oKjpRXVLHRVxcSB2UIUWeVM3pYkiWdmQTWMCThSyNuDYIHvBgVgx8fry6x8zDpPJbvbszplzmff5SCvPOfPOOT+P/ZzLe855VxGBmaXlnLILMLPiOfhmCXLwzRLk4JslyME3S9CCsla8aNGiGBwcLGv1Zkl45JFHfh4RA53zSwv+4OAg4+PjZa3eLAmSfjLdfB/qmyXIwTdLkINvliAH3yxBDr5Zgmbt1Zf0JeCPgCMRcfE07wv4J+Aa4DhwQ0T8X7eFXXfHg+zedzRTWwHXXb6Mz157SberNUtClst5Xwb+BfjqDO9fDQy1fi4D/q3157zNJfQAAWwbO8C2sQNzWs/Qm17LvTdfNbfizPrArMGPiAckDb5Mk9XAV6P5fO+YpAskvTkifjbfouYS+m48deQ5BjfunPa9630EYX0sjxt4FgNPt00fbM17SfAlrQfWAyxbtiyHVffOTEcQPq2wflDonXsRsRXYCtBoNGo5Ash0pxXeGFjd5BH8Q8DStuklrXnJmG5j8MoF5/D5D76La1csLq8wsxnkcTlvBPgzNV0O/Kqb8/t+ceLUGTZs38Pgxp2s2PzffOvRpLaFVnFZLud9A7gKWCTpIPBJ4BUAEfFFYJTmpbwJmpfz/qJXxdbVM8dPsmH7HjZs3/PCvNe/5hV88o/f6SMCK4XKGmyz0WjETE/nzdTT3q8kuO4y9xFY/iQ9EhGNzvmlPZZrvxHx4j4CX0q0XnPwK+jsRsCnA9YrDn6FdfYNXHnhG7hz3RUlV2X9wA/p1MjufUd9lcBy4eDX0NkjAW8EbL4c/Jo7uxH4wO33l12K1YjP8ftE+wNHiy94Nbf8we+4U9Bm5D1+Hzr0y1+/cCpw2efuLbscqyAHv88dPjbF4MadfOJb3y+7FKsQH+on4uy9AecAt3/43T4NSJz3+Ik5Ay+cBvhqQLoc/IRt2L6Hob/xBiBFDn7iTp5pbgCG//YebwAS4uAbAMdPNscPeNcn/6vsUqwADr69yLMnTjO4cSfX3fFg2aVYDzn4Nq2zzwV4A9CfHHx7WX4wqD85+JbJ2WcCfCNQf3DwbU62jR3grbf6EmDdOfg2Z2fCNwHVnYNvXdmwfY87AGvIwbeu7d53lOWJjYxcdw6+5SJIb1j0OnPwLVeDG3dy0aZRn/tXnINvuZs6Hb70V3EOvvXMtrEDDn9FOfjWU9vGDjC4cacHA60YB98K8dSR53jbre78qwoH3wpzKvD4fxXh4Fvhto0d8E0/JXPwrRS79x310N8lyhR8SSslPSlpQtLGad5fJuk+SY9KekzSNfmXav3m7NDf3vsXb9bgSzoX2AJcDQwDayUNdzT7BLAjIlYAa4B/zbtQ61+79x3l7ZtGyy4jKVn2+JcCExGxPyKmgLuA1R1tAnhd6/X5wE/zK9FS8Pzp8N6/QFmCvxh4um36YGteu08B10s6CIwCH5tuQZLWSxqXND45OTmPcq3f+dy/GHl17q0FvhwRS4BrgK9JesmyI2JrRDQiojEwMJDTqq3fHD425dF+eyxL8A8BS9uml7TmtbsR2AEQEQ8CrwIW5VGgpenZE6f9qG8PZQn+w8CQpOWSFtLsvBvpaHMAeB+ApHfQDL6P5a0rAQ5/j8wa/Ig4BdwE7AKeoNl7v1fSZkmrWs0+DqyT9D3gG8ANERG9KtrScfY5fz/mmy+Vlc9GoxHj4+PTvucBHWw6V174Bu5cd0XZZdSKpEciotE533fuWW24xz8/Dr7VyuFjUw5/Dhx8qx1f7uueg2+15Mt93XHwrbY8su/8OfhWew7/3Dn41hcc/rlx8K1v+Eaf7Bx86ysezz8bB9/6jsfzn52Db33J4X95Dr71LYd/Zg6+9TWHf3oOvvW9bWMH3NvfwcG3JLi3/8UcfEvGtrED/uWdLQ6+JeWpI895CG8cfEvQ7n1Hkz/nd/AtSRu27ym7hFI5+JaslAfzcPAtWc+eOJ3sMF4OviXt8LGpJDv7HHxL3u59R5O7xu/gm5He3X0OvlnLhu17kgm/g2/WJpXwO/hmHVK4xu/gm02j33v6HXyzafT7bb0OvtkM+vmQP1PwJa2U9KSkCUkbZ2jzIUmPS9or6ev5lmlWjrfd2p/j9c8afEnnAluAq4FhYK2k4Y42Q8CtwJUR8U5gQw9qNSvcqejP8/0se/xLgYmI2B8RU8BdwOqONuuALRHxDEBEHMm3TLPy9OOdfVmCvxh4um36YGteu4uAiyTtljQmaeV0C5K0XtK4pPHJycn5VWxWgm1jB8ouIVd5de4tAIaAq4C1wB2SLuhsFBFbI6IREY2BgYGcVm1WjH7a62cJ/iFgadv0kta8dgeBkYg4GRE/An5Ic0Ng1jf66X7+LMF/GBiStFzSQmANMNLR5ls09/ZIWkTz0H9/jnWaVUK/XOKbNfgRcQq4CdgFPAHsiIi9kjZLWtVqtgv4haTHgfuAWyLiF70q2qxMy/vgV3IvyNIoIkaB0Y55t7W9DuDm1o9ZXwua5/ufvfaSskuZN9+5ZzYPde/ld/DN5qnOd/U5+GbzVOe7+hx8sy7U9Sk+B9+sS3W8xOfgm+Wgbnf1OfhmOajbXX0OvllObrm7Pof8Dr5ZTk6eoTZ7fQffLEd16ehz8M1yVoffwuvgm+Xs2ROnK9/L7+Cb9UDV7+V38M0S5OCb9cjbN43O3qgkDr5Zjzx/Oip7ru/gm/VQVc/1HXyzHqvio7sOvlmP7d53tOwSXsLBNytA1c71HXyzAlTt6T0H36wgVbqP38E3K1BVOvocfLMCVaWjz8E3S5CDb1awyz53b9klOPhmRTt8bKr0Hn4H36wEt/7nY6Wu38E3K8GvT54pdf0OvlmCHHyzkpTZyZcp+JJWSnpS0oSkjS/T7oOSQlIjvxLN+tPhY1Ol3cM/a/AlnQtsAa4GhoG1koanaXce8NfAQ3kXadavynpeP8se/1JgIiL2R8QUcBewepp2nwE+DzyfY31m1gNZgr8YeLpt+mBr3gskvQdYGhE7X25BktZLGpc0Pjk5OedizfrRB26/v/B1dt25J+kc4Hbg47O1jYitEdGIiMbAwEC3qzbrC08dea7wG3qyBP8QsLRteklr3lnnARcD90v6MXA5MOIOPrPsPr6j2Ed2swT/YWBI0nJJC4E1wMjZNyPiVxGxKCIGI2IQGANWRcR4Tyo260Ono9hfuDlr8CPiFHATsAt4AtgREXslbZa0qtcFmqXiC7ueLGxdC7I0iohRYLRj3m0ztL2q+7LM0nPol78ubF2+c88sQQ6+WYUUdSefg29WIUXdyefgm1VMEQNyOvhmFVPEgJwOvlmCHHyzCur14b6Db1ZBvT7cd/DNEuTgmyXIwTerqF7ezOPgm1VUL2/mcfDNEuTgm1VYr4blcvDNKuypI8/1ZLkOvlnF9aKTz8E3q7hedPI5+GYJcvDNaiDvgTgdfLMa+PS39+a6PAffrAaeOX4y1+U5+GYJcvDNEuTgm9VEnh18Dr5ZTdxyd36/X8/BN6uJk2fyW5aDb5YgB9+sRvI6z3fwzWpkw/Z8zvMdfLMEZQq+pJWSnpQ0IWnjNO/fLOlxSY9J+o6kt+RfqpnlZdbgSzoX2AJcDQwDayUNdzR7FGhExLuAbwJ/l3ehZpafLHv8S4GJiNgfEVPAXcDq9gYRcV9EHG9NjgFL8i3TzPKUJfiLgafbpg+25s3kRuCe6d6QtF7SuKTxycnJ7FWa2QvyGJEn1849SdcDDeAL070fEVsjohERjYGBgTxXbZaMPEbkWZChzSFgadv0kta8F5H0fmAT8N6IONF1ZWbWM1n2+A8DQ5KWS1oIrAFG2htIWgH8O7AqIo7kX6aZ5WnW4EfEKeAmYBfwBLAjIvZK2ixpVavZF4DfAu6WtEfSyAyLM7McdHuen+VQn4gYBUY75t3W9vr9XVVhZnOybewAn732knl/3nfumSXIwTdLkINvliAH3yxBDr5Zghx8swQ5+GYJcvDNauqyz9077886+GY1dfjY1Lw/6+CbJcjBN0uQg2+WIAffrMbmO86+g29WY5/+9t55fc7BN6uxZ46fnNfnHHyzBDn4Zgly8M0S5OCbJcjBN0uQg2+WIAffLEEOvlmCHHyzBDn4Zgly8M1qbj4P6jj4ZjU3nwd1HHyzmpvPgzoOvlmCHHyzBDn4ZgnKFHxJKyU9KWlC0sZp3n+lpO2t9x+SNJh3oWaWn1mDL+lcYAtwNTAMrJU03NHsRuCZiHgb8A/A5/Mu1Mzyk2WPfykwERH7I2IKuAtY3dFmNfCV1utvAu+TpPzKNLM8ZQn+YuDptumDrXnTtomIU8CvgDd2LkjSeknjksYnJyfnV7GZda3Qzr2I2BoRjYhoDAwMzNjuygvfUGBVZvU2n7xkCf4hYGnb9JLWvGnbSFoAnA/8Ys7VtNy57gqG3vTa+X7cLBm/fd5C7lx3xZw/lyX4DwNDkpZLWgisAUY62owAf956/SfA/0ZEzLmaNvfefBX/+OF3s/iCV3ezmEpxp4flRcD1ly/joU0fmNfnF8zWICJOSboJ2AWcC3wpIvZK2gyMR8QI8B/A1yRNAEdpbhy6du2KxVy7orM7wcy6NWvwASJiFBjtmHdb2+vngT/NtzQz6xXfuWeWIAffLEEOvlmCHHyzBKnLq27zX7E0CfwkQ9NFwM97XE43qlxflWuDatdX5doge31viYiX3C1XWvCzkjQeEY2y65hJleurcm1Q7fqqXBt0X58P9c0S5OCbJagOwd9adgGzqHJ9Va4Nql1flWuDLuur/Dm+meWvDnt8M8uZg2+WoMoEv8oDemao7WZJj0t6TNJ3JL2lqNqy1NfW7oOSQlJhl6my1CbpQ63vb6+krxdVW5b6JC2TdJ+kR1v/vtcUWNuXJB2R9IMZ3pekf27V/pik92ReeESU/kPzcd99wFuBhcD3gOGONn8JfLH1eg2wvUK1/R7wmtbrjxZVW9b6Wu3OAx4AxoBGVWoDhoBHgde3pt9Upe+OZifaR1uvh4EfF1jf7wLvAX4ww/vXAPfQfDz/cuChrMuuyh6/ygN6zlpbRNwXEcdbk2M0RykqSpbvDuAzNEc/fr5ita0DtkTEMwARcaRi9QXwutbr84GfFlVcRDxAc3yLmawGvhpNY8AFkt6cZdlVCX5uA3qWVFu7G2luhYsya32tQ8ClEbGzwLog23d3EXCRpN2SxiStLKy6bPV9Crhe0kGaY1J8rJjSMpnr/80XZBqIw7KRdD3QAN5bdi1nSToHuB24oeRSZrKA5uH+VTSPlB6QdElE/LLUqn5jLfDliPh7SVfQHGnq4og4U3Zh3ajKHr/wAT1zrg1J7wc2Aasi4kQBdZ01W33nARcD90v6Mc1zwZGCOviyfHcHgZGIOBkRPwJ+SHNDUIQs9d0I7ACIiAeBV9F8QKYKMv3fnFZRHRWzdGIsAPYDy/lNJ8s7O9r8FS/u3NtRodpW0OwkGqrid9fR/n6K69zL8t2tBL7Ser2I5qHrGytU3z3ADa3X76B5jq8C/30Hmblz7w95cefedzMvt6i/QIa/4DU0t/b7gE2teZtp7kGhuaW9G5gAvgu8tUK1/Q9wGNjT+hmp0nfX0baw4Gf87kTzVORx4PvAmip9dzR78ne3Ngp7gN8vsLZvAD8DTtI8MroR+Ajwkbbvbkur9u/P5d/Vt+yaJagq5/hmViAH3yxBDr5Zghx8swQ5+GYJcvDNEuTgmyXo/wFUoZhfY0TtQQAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as pt\n",
    "\n",
    "samples_host = samples.get()\n",
    "accepted_host = accepted.get() != 0\n",
    "\n",
    "pt.gca().set_aspect(\"equal\")\n",
    "pt.plot(\n",
    "    samples_host[accepted_host].real,\n",
    "    samples_host[accepted_host].imag, \"o\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Lastly, we compute the ratio of accepted to the total number of samples to get our approximate value of $\\pi$:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3.14292"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "4 * cl.array.sum(accepted).get() / len(samples)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We (roughly) expect convergence as $1/\\sqrt N$, so this gives an idea of the relative error to expect:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.00223606797749979"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "1/np.sqrt(len(samples))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Direct Reduction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mif __OPENCL_C_VERSION__ < 120\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mpragma OPENCL EXTENSION cl_khr_fp64: enable\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mendif\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine PYOPENCL_DEFINE_CDOUBLE\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36minclude\u001b[39;49;00m \u001b[37m<pyopencl-complex.h>\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36minclude\u001b[39;49;00m \u001b[37m<pyopencl-random123/philox.cl>\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\n",
      "\n",
      "\n",
      "\u001b[34mtypedef\u001b[39;49;00m \u001b[34munion\u001b[39;49;00m {\n",
      "    uint4 v;\n",
      "    philox4x32_ctr_t c;\n",
      "} philox4x32_ctr_vec_union;\n",
      "\n",
      "\n",
      "uint4 \u001b[32mphilox4x32_bump\u001b[39;49;00m(uint4 ctr)\n",
      "{\n",
      "    \u001b[34mif\u001b[39;49;00m (++ctr.x == \u001b[34m0\u001b[39;49;00m)\n",
      "        \u001b[34mif\u001b[39;49;00m (++ctr.y == \u001b[34m0\u001b[39;49;00m)\n",
      "            ++ctr.z;\n",
      "    \u001b[34mreturn\u001b[39;49;00m ctr;\n",
      "}\n",
      "\n",
      "uint4 \u001b[32mphilox4x32_gen\u001b[39;49;00m(\n",
      "        uint4 ctr,\n",
      "        uint2 key,\n",
      "        uint4 *new_ctr)\n",
      "{\n",
      "    philox4x32_ctr_vec_union result;\n",
      "    result.c = philox4x32(\n",
      "        *(philox4x32_ctr_t *) &ctr,\n",
      "        *(philox4x32_key_t *) &key);\n",
      "    *new_ctr = philox4x32_bump(ctr);\n",
      "    \u001b[34mreturn\u001b[39;49;00m result.v;\n",
      "}\n",
      "\n",
      "float4 \u001b[32mphilox4x32_f32\u001b[39;49;00m(\n",
      "        uint4 ctr,\n",
      "        uint2 key,\n",
      "        uint4 *new_ctr)\n",
      "{\n",
      "    *new_ctr = ctr;\n",
      "    \u001b[34mreturn\u001b[39;49;00m\n",
      "        convert_float4(philox4x32_gen(*new_ctr, key, new_ctr))\n",
      "        * \u001b[34m2.3283064365386963e-10\u001b[39;49;00mf;\n",
      "}\n",
      "\n",
      "double4 \u001b[32mphilox4x32_f64\u001b[39;49;00m(\n",
      "        uint4 ctr,\n",
      "        uint2 key,\n",
      "        uint4 *new_ctr)\n",
      "{\n",
      "    *new_ctr = ctr;\n",
      "        \u001b[34mreturn\u001b[39;49;00m\n",
      "            convert_double4(philox4x32_gen(*new_ctr, key, new_ctr))\n",
      "            * \u001b[34m2.3283064365386963e-10\u001b[39;49;00m\n",
      "            +\n",
      "            convert_double4(philox4x32_gen(*new_ctr, key, new_ctr))\n",
      "            * \u001b[34m5.421010862427522e-20\u001b[39;49;00m;\n",
      "\n",
      "}\n",
      "\n",
      "__kernel \u001b[36mvoid\u001b[39;49;00m \u001b[32m__attribute__\u001b[39;49;00m ((reqd_work_group_size(\u001b[34m128\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(\u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m nl, __global \u001b[36mint\u001b[39;49;00m *__restrict__ result, __global \u001b[36mint\u001b[39;49;00m *__restrict__ tmp)\n",
      "{\n",
      "  __local \u001b[36mint\u001b[39;49;00m acc_l_inner[\u001b[34m128\u001b[39;49;00m];\n",
      "  \u001b[36mint\u001b[39;49;00m acc_l_outer;\n",
      "  \u001b[36mint\u001b[39;49;00m accepted[\u001b[34m2\u001b[39;49;00m];\n",
      "  uint4 ctr;\n",
      "  uint4 dummy;\n",
      "  uint4 insn_1_retval_1;\n",
      "  uint2 key2;\n",
      "  \u001b[36mint\u001b[39;49;00m neutral_l_inner;\n",
      "  float4 rng_res;\n",
      "  cdouble_t samples[\u001b[34m2\u001b[39;49;00m];\n",
      "\n",
      "  \u001b[34mif\u001b[39;49;00m (-\u001b[34m1\u001b[39;49;00m + nl >= \u001b[34m0\u001b[39;49;00m)\n",
      "  {\n",
      "    \u001b[34mif\u001b[39;49;00m (-\u001b[34m1\u001b[39;49;00m + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + nl >= \u001b[34m0\u001b[39;49;00m)\n",
      "      acc_l_outer = \u001b[34m0\u001b[39;49;00m;\n",
      "    neutral_l_inner = \u001b[34m0\u001b[39;49;00m;\n",
      "    acc_l_inner[lid(\u001b[34m0\u001b[39;49;00m)] = \u001b[34m0\u001b[39;49;00m;\n",
      "    ctr = (uint4) (\u001b[34m0\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m, \u001b[34m2\u001b[39;49;00m, \u001b[34m3\u001b[39;49;00m);\n",
      "    \u001b[34mif\u001b[39;49;00m (-\u001b[34m1\u001b[39;49;00m + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + nl >= \u001b[34m0\u001b[39;49;00m)\n",
      "    {\n",
      "      \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m l_outer = \u001b[34m0\u001b[39;49;00m; l_outer <= -\u001b[34m1\u001b[39;49;00m + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + (\u001b[34m127\u001b[39;49;00m + nl + \u001b[34m127\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m)) / \u001b[34m128\u001b[39;49;00m; ++l_outer)\n",
      "      {\n",
      "        key2 = (uint2) (nl * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m) + l_outer * \u001b[34m128\u001b[39;49;00m, \u001b[34m1234\u001b[39;49;00m);\n",
      "        rng_res = philox4x32_f32(ctr, key2, &(insn_1_retval_1));\n",
      "        dummy = insn_1_retval_1;\n",
      "        samples[\u001b[34m1\u001b[39;49;00m] = cdouble_radd(rng_res.s2, cdouble_rmul(rng_res.s3, cdouble_new(\u001b[34m0.0\u001b[39;49;00m, \u001b[34m1.0\u001b[39;49;00m)));\n",
      "        samples[\u001b[34m0\u001b[39;49;00m] = cdouble_radd(rng_res.s0, cdouble_rmul(rng_res.s1, cdouble_new(\u001b[34m0.0\u001b[39;49;00m, \u001b[34m1.0\u001b[39;49;00m)));\n",
      "        \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m isamp = \u001b[34m0\u001b[39;49;00m; isamp <= \u001b[34m1\u001b[39;49;00m; ++isamp)\n",
      "          accepted[isamp] = cdouble_real(cdouble_rmul(\u001b[34m1\u001b[39;49;00m, cdouble_mul(samples[isamp], cdouble_conj(samples[isamp])))) < \u001b[34m1.0\u001b[39;49;00m;\n",
      "        acc_l_outer = acc_l_outer + accepted[\u001b[34m0\u001b[39;49;00m] + accepted[\u001b[34m1\u001b[39;49;00m];\n",
      "      }\n",
      "      acc_l_inner[lid(\u001b[34m0\u001b[39;49;00m)] = neutral_l_inner + acc_l_outer;\n",
      "    }\n",
      "    barrier(CLK_LOCAL_MEM_FENCE) \u001b[37m/* for acc_l_inner (red_l_inner_stage_0 depends on insn_3_l_inner_init) */\u001b[39;49;00m;\n",
      "    \u001b[34mif\u001b[39;49;00m (\u001b[34m63\u001b[39;49;00m + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) >= \u001b[34m0\u001b[39;49;00m)\n",
      "      acc_l_inner[lid(\u001b[34m0\u001b[39;49;00m)] = acc_l_inner[lid(\u001b[34m0\u001b[39;49;00m)] + acc_l_inner[\u001b[34m64\u001b[39;49;00m + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "    barrier(CLK_LOCAL_MEM_FENCE) \u001b[37m/* for acc_l_inner (red_l_inner_stage_1 depends on red_l_inner_stage_0) */\u001b[39;49;00m;\n",
      "    \u001b[34mif\u001b[39;49;00m (\u001b[34m31\u001b[39;49;00m + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) >= \u001b[34m0\u001b[39;49;00m)\n",
      "      acc_l_inner[lid(\u001b[34m0\u001b[39;49;00m)] = acc_l_inner[lid(\u001b[34m0\u001b[39;49;00m)] + acc_l_inner[\u001b[34m32\u001b[39;49;00m + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "    barrier(CLK_LOCAL_MEM_FENCE) \u001b[37m/* for acc_l_inner (red_l_inner_stage_2 depends on red_l_inner_stage_1) */\u001b[39;49;00m;\n",
      "    \u001b[34mif\u001b[39;49;00m (\u001b[34m15\u001b[39;49;00m + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) >= \u001b[34m0\u001b[39;49;00m)\n",
      "      acc_l_inner[lid(\u001b[34m0\u001b[39;49;00m)] = acc_l_inner[lid(\u001b[34m0\u001b[39;49;00m)] + acc_l_inner[\u001b[34m16\u001b[39;49;00m + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "    barrier(CLK_LOCAL_MEM_FENCE) \u001b[37m/* for acc_l_inner (red_l_inner_stage_3 depends on red_l_inner_stage_2) */\u001b[39;49;00m;\n",
      "    \u001b[34mif\u001b[39;49;00m (\u001b[34m7\u001b[39;49;00m + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) >= \u001b[34m0\u001b[39;49;00m)\n",
      "      acc_l_inner[lid(\u001b[34m0\u001b[39;49;00m)] = acc_l_inner[lid(\u001b[34m0\u001b[39;49;00m)] + acc_l_inner[\u001b[34m8\u001b[39;49;00m + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "    barrier(CLK_LOCAL_MEM_FENCE) \u001b[37m/* for acc_l_inner (red_l_inner_stage_4 depends on red_l_inner_stage_3) */\u001b[39;49;00m;\n",
      "    \u001b[34mif\u001b[39;49;00m (\u001b[34m3\u001b[39;49;00m + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) >= \u001b[34m0\u001b[39;49;00m)\n",
      "      acc_l_inner[lid(\u001b[34m0\u001b[39;49;00m)] = acc_l_inner[lid(\u001b[34m0\u001b[39;49;00m)] + acc_l_inner[\u001b[34m4\u001b[39;49;00m + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "    barrier(CLK_LOCAL_MEM_FENCE) \u001b[37m/* for acc_l_inner (red_l_inner_stage_5 depends on red_l_inner_stage_4) */\u001b[39;49;00m;\n",
      "    \u001b[34mif\u001b[39;49;00m (\u001b[34m1\u001b[39;49;00m + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) >= \u001b[34m0\u001b[39;49;00m)\n",
      "      acc_l_inner[lid(\u001b[34m0\u001b[39;49;00m)] = acc_l_inner[lid(\u001b[34m0\u001b[39;49;00m)] + acc_l_inner[\u001b[34m2\u001b[39;49;00m + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "    barrier(CLK_LOCAL_MEM_FENCE) \u001b[37m/* for acc_l_inner (red_l_inner_stage_6 depends on red_l_inner_stage_5) */\u001b[39;49;00m;\n",
      "    \u001b[34mif\u001b[39;49;00m (lid(\u001b[34m0\u001b[39;49;00m) == \u001b[34m0\u001b[39;49;00m)\n",
      "      acc_l_inner[\u001b[34m0\u001b[39;49;00m] = acc_l_inner[\u001b[34m0\u001b[39;49;00m] + acc_l_inner[\u001b[34m1\u001b[39;49;00m];\n",
      "    tmp[gid(\u001b[34m0\u001b[39;49;00m)] = acc_l_inner[\u001b[34m0\u001b[39;49;00m];\n",
      "  }\n",
      "}\n",
      "\n",
      "__kernel \u001b[36mvoid\u001b[39;49;00m \u001b[32m__attribute__\u001b[39;49;00m ((reqd_work_group_size(\u001b[34m50\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel_0(\u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m nl, __global \u001b[36mint\u001b[39;49;00m *__restrict__ result, __global \u001b[36mint\u001b[39;49;00m *__restrict__ tmp)\n",
      "{\n",
      "  __local \u001b[36mint\u001b[39;49;00m acc_j[\u001b[34m50\u001b[39;49;00m];\n",
      "  \u001b[36mint\u001b[39;49;00m neutral_j;\n",
      "\n",
      "  neutral_j = \u001b[34m0\u001b[39;49;00m;\n",
      "  acc_j[lid(\u001b[34m0\u001b[39;49;00m)] = \u001b[34m0\u001b[39;49;00m;\n",
      "  \u001b[34mif\u001b[39;49;00m (-\u001b[34m1\u001b[39;49;00m + nl >= \u001b[34m0\u001b[39;49;00m)\n",
      "    acc_j[lid(\u001b[34m0\u001b[39;49;00m)] = neutral_j + tmp[lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "  barrier(CLK_LOCAL_MEM_FENCE) \u001b[37m/* for acc_j (red_j_stage_0 depends on finalred_j_init) */\u001b[39;49;00m;\n",
      "  \u001b[34mif\u001b[39;49;00m (\u001b[34m17\u001b[39;49;00m + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) >= \u001b[34m0\u001b[39;49;00m)\n",
      "    acc_j[lid(\u001b[34m0\u001b[39;49;00m)] = acc_j[lid(\u001b[34m0\u001b[39;49;00m)] + acc_j[\u001b[34m32\u001b[39;49;00m + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "  barrier(CLK_LOCAL_MEM_FENCE) \u001b[37m/* for acc_j (red_j_stage_1 depends on red_j_stage_0) */\u001b[39;49;00m;\n",
      "  \u001b[34mif\u001b[39;49;00m (\u001b[34m15\u001b[39;49;00m + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) >= \u001b[34m0\u001b[39;49;00m)\n",
      "    acc_j[lid(\u001b[34m0\u001b[39;49;00m)] = acc_j[lid(\u001b[34m0\u001b[39;49;00m)] + acc_j[\u001b[34m16\u001b[39;49;00m + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "  barrier(CLK_LOCAL_MEM_FENCE) \u001b[37m/* for acc_j (red_j_stage_2 depends on red_j_stage_1) */\u001b[39;49;00m;\n",
      "  \u001b[34mif\u001b[39;49;00m (\u001b[34m7\u001b[39;49;00m + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) >= \u001b[34m0\u001b[39;49;00m)\n",
      "    acc_j[lid(\u001b[34m0\u001b[39;49;00m)] = acc_j[lid(\u001b[34m0\u001b[39;49;00m)] + acc_j[\u001b[34m8\u001b[39;49;00m + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "  barrier(CLK_LOCAL_MEM_FENCE) \u001b[37m/* for acc_j (red_j_stage_3 depends on red_j_stage_2) */\u001b[39;49;00m;\n",
      "  \u001b[34mif\u001b[39;49;00m (\u001b[34m3\u001b[39;49;00m + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) >= \u001b[34m0\u001b[39;49;00m)\n",
      "    acc_j[lid(\u001b[34m0\u001b[39;49;00m)] = acc_j[lid(\u001b[34m0\u001b[39;49;00m)] + acc_j[\u001b[34m4\u001b[39;49;00m + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "  barrier(CLK_LOCAL_MEM_FENCE) \u001b[37m/* for acc_j (red_j_stage_4 depends on red_j_stage_3) */\u001b[39;49;00m;\n",
      "  \u001b[34mif\u001b[39;49;00m (\u001b[34m1\u001b[39;49;00m + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) >= \u001b[34m0\u001b[39;49;00m)\n",
      "    acc_j[lid(\u001b[34m0\u001b[39;49;00m)] = acc_j[lid(\u001b[34m0\u001b[39;49;00m)] + acc_j[\u001b[34m2\u001b[39;49;00m + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "  barrier(CLK_LOCAL_MEM_FENCE) \u001b[37m/* for acc_j (red_j_stage_5 depends on red_j_stage_4) */\u001b[39;49;00m;\n",
      "  \u001b[34mif\u001b[39;49;00m (lid(\u001b[34m0\u001b[39;49;00m) == \u001b[34m0\u001b[39;49;00m)\n",
      "    acc_j[\u001b[34m0\u001b[39;49;00m] = acc_j[\u001b[34m0\u001b[39;49;00m] + acc_j[\u001b[34m1\u001b[39;49;00m];\n",
      "  result[\u001b[34m0\u001b[39;49;00m] = acc_j[\u001b[34m0\u001b[39;49;00m];\n",
      "}\n",
      "\n",
      "3.1417216 0.1 billion samples\n"
     ]
    }
   ],
   "source": [
    "knl = lp.make_kernel(\n",
    "        \"{ [l, g, j, isamp]: 0<=l<nl and 0<=g,j<ng and 0<=isamp< 2}\",\n",
    "        \"\"\"\n",
    "        <> ctr = make_uint4(0, 1, 2, 3)\n",
    "        for g\n",
    "            for l\n",
    "                <> key2 = make_uint2(l + nl*g, 1234)\n",
    "                <> rng_res, <> dummy = philox4x32_f32(ctr, key2)\n",
    "\n",
    "                <> samples[0] = rng_res.s0 + 1j*rng_res.s1     {id=samp1}\n",
    "                   samples[1] = rng_res.s2 + 1j*rng_res.s3     {id=samp2}\n",
    "\n",
    "                <> accepted[isamp] = real(samples[isamp] * conj(samples[isamp])) < 1 \\\n",
    "                                                                {dep=samp1:samp2} \n",
    "            end\n",
    "        \n",
    "            <> tmp[g] = sum(l, accepted[0] + accepted[1]) \n",
    "        end\n",
    "        ... gbarrier {id=barr,dep_query=writes:tmp}\n",
    "        result = sum(j, tmp[j]) {id=finalred,dep=barr}\n",
    "        \"\"\")                                                                     \n",
    "\n",
    "size = 1000000\n",
    "ng = 50\n",
    "\n",
    "knl = lp.fix_parameters(knl, ng=ng)                                               \n",
    "\n",
    "knl = lp.set_options(knl, write_cl=True, highlight_cl=True)                                          \n",
    "\n",
    "ref_knl = knl                                                                     \n",
    "\n",
    "knl = lp.split_iname(knl, \"l\", 128, inner_tag=\"l.0\")\n",
    "knl = lp.split_reduction_outward(knl, \"l_inner\")\n",
    "knl = lp.tag_inames(knl, \"g:g.0,j:l.0\")\n",
    "knl = lp.preprocess_kernel(knl)\n",
    "knl = lp.add_dependency(knl, \"id:finalred_j_init\", \"id:barr\")\n",
    "knl = lp.add_dependency(knl, \"id:finalred_j_init_neutral\", \"id:barr\")\n",
    "\n",
    "evt, (result,) = knl(queue, nl=size) \n",
    "\n",
    "nsamples = size*2*ng\n",
    "print(4*result.get()/nsamples, nsamples/1e9, \"billion samples\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
